View paper on journal site View paper on Altmetric
Genetic clustering algorithms, implemented in programs such as STRUCTURE and ADMIXTURE, have been used extensively in the characterisation of individuals and populations based on genetic data. A successful example is the reconstruction of the genetic history of African Americans as a product of recent admixture between highly differentiated populations. Histories can also be reconstructed using the same procedure for groups that do not have admixture in their recent history, that have experienced strong recent genetic drift, or that deviate in other ways from the underlying inference model. Unfortunately, such histories can be misleading. We have implemented an approach, badMIXTURE, to assess the goodness of fit of the model using the ancestry “palettes” estimated by CHROMOPAINTER and apply it to both simulated data and real case studies. Combining these complementary analyses with additional methods that are designed to test specific hypotheses allows a richer and more robust analysis of recent demographic history.
To use the Twitter API, sign up for a developer account and obtain the necessary API tokens. For security, these are stored in a separate _config.yaml file.
#-----------------------------------------------------------------------------
# API keys
#-----------------------------------------------------------------------------
# Load Twitter API credentials from a local YAML file kept out of version
# control, so keys are never committed with the analysis.
keys <- yaml.load_file("_config.yaml")
## Warning in readLines(con): incomplete final line found on '_config.yaml'
# oauth for tweetscores functions
# tweetscores takes a plain list with these four OAuth fields.
my_oauth <- list(consumer_key=keys$consumer_key,
consumer_secret=keys$consumer_secret,
access_token=keys$access_token,
access_token_secret=keys$access_secret)
# set token for rtweet
token <- create_token(
app = keys$app_name,
consumer_key = keys$consumer_key,
consumer_secret = keys$consumer_secret,
access_token=keys$access_token,
access_secret=keys$access_secret)
# setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
This code defines functions that will be used in generating and formatting data for the LDA model.
#-----------------------------------------------------------------------------
# functions for topic modeling
#-----------------------------------------------------------------------------
# tmls <- get_timelines(plotdat$account[1:4], n = 3200) # testing with timelines instead of bios

# Look up profile metadata (including bios) for an account's followers.
#
# followers: list or vector of follower IDs/handles; flattened with unlist().
# max_followers: cap on how many followers are sent to the Twitter API
#   (default 10000, the previously hard-coded limit, now a parameter).
# Returns the user-metadata data frame produced by lookup_users().
get_follower_bios <- function(followers, max_followers = 10000){
  ids <- unlist(followers)
  ids <- head(ids, max_followers)
  lookup_users(ids, token=token)
}
# get emoji lookup table
# scrapeEmoticons.R defines `alltogether`, a table with each emoji's Unicode
# form (Native) and a plain-text Description.
source(paste0(datadir, "/scrapeEmoticons.R"))
# source script directly from https://github.com/today-is-a-good-day/emojis
# script <- getURL("https://raw.githubusercontent.com/today-is-a-good-day/emojis/master/scrapeEmoticons.R", ssl.verifypeer = FALSE)
# eval(parse(text = script))
# Build two textual forms of each emoji: Desc_Sub (space-padded, used when
# replacing emoji with text) and Desc_Sub2 (no spaces, used when matching
# tokenized text back to emoji).
lookup <- alltogether %>%
mutate(Desc_Sub=tolower(paste0(" emoji", gsub("[+]|-| ", "", Description), " "))) %>% # ensure textualized emoji is separated by space
mutate(Desc_Sub2=gsub(" ", "", Desc_Sub)) # strip spaces for proper text -> emoji matching
# define lookup matching functions
# https://stackoverflow.com/questions/31828218/replace-values-in-character-vector-using-lookup-table-in-r

# Replace every native emoji in `x` with its textual token (lookup$Desc_Sub),
# applying each row of the global `lookup` table in turn via fixed-string gsub.
# (Fix: use TRUE rather than the reassignable shorthand T.)
emoji_to_text <- function(x) {
  Reduce(function(x, r) gsub(lookup$Native[r], lookup$Desc_Sub[r], x, fixed = TRUE),
         seq_len(nrow(lookup)), x)
}

# Inverse mapping: replace textual emoji tokens (lookup$Desc_Sub2) with the
# native emoji characters, for display of model output.
text_to_emoji <- function(x) {
  Reduce(function(x, r) gsub(lookup$Desc_Sub2[r], lookup$Native[r], x, fixed = TRUE),
         seq_len(nrow(lookup)), x)
}
#-----------------------------------------------------------------------------
# function counts number of test account's followers in each of
# training accounts' follower groups
#-----------------------------------------------------------------------------
# For one test account's follower list `a1`, compute the fraction of those
# followers that also appear in each reference account's follower list
# (test_df$followers). Returns a one-row data frame with one column per
# reference account, named by test_df$account.
match_followers <- function(a1, test_df=training_data_full){
  target <- unlist(a1)
  frac <- vapply(test_df$followers,
                 function(f) sum(unlist(f) %in% target) / length(target),
                 numeric(1))
  names(frac) <- test_df$account
  data.frame(as.list(frac))
}
#-----------------------------------------------------------------------------
# read reference data from scientist/WN classifications
#-----------------------------------------------------------------------------
# basic data frame containing curated usernames of WN
training_data_fh <- paste0(datadir, "/training_data/training_data_handles.rds")
training_data <- readRDS(training_data_fh)
# extended data frame containing user metadata & follower lists
# Built once and cached to disk; delete the .rds file to force a re-scrape.
training_data_full_fh <- paste0(datadir, "/training_data/training_data_full2.rds")
if(file.exists(training_data_full_fh)){
training_data_full <- readRDS(training_data_full_fh)
} else {
# pull profile metadata for the curated handles and rejoin the labels
training_user_data <- lookup_users(training_data$account) %>%
mutate(account=screen_name) %>%
left_join(training_data, "account")
# each API call returns up to 5,000 followers; if more than 15 calls will be
# needed in total, sleep 60s between calls to stay under the rate limit
fc_mod <- ceiling(training_user_data$followers_count/5000)
sleep <- ifelse(sum(fc_mod)>15, 60, 0)
# get followers for accounts in training data
training_data_full <- training_user_data %>% #head
rowwise %>%
mutate(followers=getFollowers(screen_name=account, oauth=my_oauth, sleep=sleep) %>%
data.frame %>%
as.list) # tweetscores
# old version using rtweet API calls--requires fixed n
# mutate(followers=get_followers(account, n=20000, token=token, retryonratelimit = TRUE) %>% as.list)
saveRDS(training_data_full, training_data_full_fh)
}
The code below will scrape data from Altmetric and Twitter to generate a tibble where each row contains the twitter handle of an account that has tweeted or RTed the specified article, and a vector of all accounts that follow that user.
Because this can be very time-consuming, a .rds file containing this tibble will be stored to disk so analyses can be repeated without needing to re-scrape the data.
#-----------------------------------------------------------------------------
# get data from Altmetric
#-----------------------------------------------------------------------------
# Scrape the Altmetric summary page for the article's DOI and abstract.
summary_page <- read_html(article_full_url)
doi <- summary_page %>%
  html_nodes("div.document-details-table tr:nth-child(3) :nth-child(1)") %>%
  html_text()
# first matched node is the "DOI" label; second is the value
article_doi <- doi[2]
if(grepl("biorxiv", article_full_url)){
  # bioRxiv abstracts are not on the Altmetric summary page, so fetch the
  # bioRxiv page directly. (Fix: this fetch was commented out, leaving
  # `biorxiv_page` undefined on this branch.)
  biorxiv_page <- read_html(paste0("https://www.biorxiv.org/content/", article_doi, "v1"))
  abstract <- biorxiv_page %>% html_nodes("div.abstract #p-2") %>% html_text()
} else {
  abstract1 <- summary_page %>% html_nodes("div.content-wrapper tr:nth-child(6) :nth-child(1)") %>% html_text()
  abstract <- gsub("\n", "", abstract1[2])
}
# altmetric metadata from API
article_am <- altmetrics(doi = article_doi)
article_df <- altmetric_data(article_am)
# id <- article_df$altmetric_id
# url <- paste0(article_base_url, id, "/twitter/page:1")
twitter_url <- paste0(article_full_url, "/twitter")
twitter_page <- read_html(twitter_url)
# number of pages is the ceiling of total tweets/100
totals <- twitter_page %>% html_nodes("div.text strong") %>% html_text()
npages <- ceiling(as.integer(totals[1])/100)
# loop through pages of tweets on altmetric to get handles of tweeting users
# if(!exists("user_data")){
# Each Altmetric page lists up to 100 tweets; collect handle, tweet text, and
# timestamp from every page. (Fixes: seq_len() instead of 1:npages so zero
# pages is safe; loop variable no longer overwritten by the parsed page; page
# results collected in a preallocated list and bound once instead of growing
# ts_df inside the loop.)
page_tables <- vector("list", npages)
for(page_num in seq_len(npages)){
  # url <- paste0(article_base_url, id, "/twitter/page:", page)
  page_url <- paste0(twitter_url, "/page:", page_num)
  page_html <- read_html(page_url)
  # get div objects containing handles of tweeting users
  names <- gsub("@", "", html_nodes(page_html, "div.handle") %>% html_text())
  tweets <- gsub("\n", "", html_nodes(page_html, "div.content") %>% html_text())
  timestamps <- html_nodes(page_html, "time") %>% html_attrs() %>% unlist()
  page_tables[[page_num]] <- data.frame(names, timestamps, tweets)
}
ts_df <- bind_rows(page_tables)
# unique handles of all tweeting users ("@" already stripped above)
handles <- unique(ts_df$names)
# Original (non-RT) tweets: strip quote characters, prefix the author handle,
# truncate to 100 characters, then count occurrences of each distinct text.
original <- ts_df %>%
  mutate(tweets = gsub("\"", "", tweets)) %>%
  dplyr::filter(!grepl("RT @", tweets)) %>%
  mutate(tweets = substr(paste0("@", names, ": ", tweets), 1, 100)) %>%
  dplyr::select(tweets) %>%
  dplyr::group_by(tweets) %>%
  count()
# Retweets: strip quote characters and the leading "RT " marker, truncate,
# then count each distinct RT text, most-retweeted first.
rts <- ts_df %>%
  mutate(tweets = gsub("\"", "", tweets)) %>%
  dplyr::filter(grepl("RT @", tweets)) %>%
  # mutate(tweets=gsub("RT @[^:]*: ", "", tweets)) %>%
  mutate(tweets = substr(gsub("RT ", "", tweets), 1, 100)) %>%
  group_by(tweets) %>%
  count() %>%
  arrange(desc(n))
# get user metadata from Twitter API
user_data <- lookup_users(handles)
# }
#-----------------------------------------------------------------------------
# Get follower lists for users
# due to Twitter API limits, this will take at least N minutes,
# where N is the number of unique users that have tweeted about an article
#
# users with >5,000 followers will require multiple API calls to scrape their
# full follower lists, so a user with 75,000 followers will take
# the same amount of time to process as 15 users with <5,000 followers each
# (~15 minutes)
#-----------------------------------------------------------------------------
# altmetric_data_full_fh <- paste0(datadir, "altmetric_data_full.rds")
altmetric_data_full_fh <- paste0(datadir, "/article_data/altmetric_data_full_", article_id, ".rds")
if(file.exists(altmetric_data_full_fh)){
  altmetric_data_full <- readRDS(altmetric_data_full_fh)
} else {
  # get follower metadata from Twitter API
  # sleep interval--if more than 15 API calls will be required, use one call per minute to
  # minimize weird timeout issues
  fc_mod <- ceiling(user_data$followers_count/5000)
  sleep <- ifelse(sum(fc_mod)>15, 60, 0)
  # set reset to FALSE to resume a partially completed scrape held in memory
  reset <- TRUE
  # scalar condition, so use || (fix: was the vectorized |)
  if(!exists("altmetric_data_full") || reset){
    altmetric_data_full <- tibble(account=character(), followers=list())
  }
  # resume after rows already collected; the explicit guard prevents the
  # descending sequence (n+1):n from running when nothing remains to fetch
  start_row <- nrow(altmetric_data_full) + 1
  if(start_row <= nrow(user_data)){
    for(user in start_row:nrow(user_data)){
      # refresh OAuth handshake
      # if(user %% 20 == 0){
      # twitteR::setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
      # }
      # protected accounts cannot have their follower lists scraped
      if(user_data[user,]$protected){
        cat(paste0("User ", user_data[user,]$screen_name, " is protected--skipping\n"))
      } else {
        cat(paste0("Getting follower list for: ", user_data[user,]$screen_name,
        " (", user, " of ", nrow(user_data), ")...\n"))
        altmetric_data_user <- user_data[user,] %>%
          dplyr::select(account=screen_name) %>% #head
          mutate(followers = getFollowers(screen_name=account, oauth=my_oauth, sleep=sleep) %>%
            data.frame %>%
            as.list)
        altmetric_data_full <- bind_rows(list(altmetric_data_full, altmetric_data_user))
      }
    }
  }
  saveRDS(altmetric_data_full, altmetric_data_full_fh)
}
This code scrapes the bios and account names of the followers for each account (up to 10,000), compiling the bios into a single “document” for each account that has tweeted/RTed a reference to the article. Then the frequencies of each term per document are calculated and used to generate a document term matrix, which serves as the input for the LDA model in the next section.
Terms include emoji and hashtags, but exclude common terms in the stop_words dataset as well as words in a custom stop word list that are frequently seen in bios and are uninformative.
#-----------------------------------------------------------------------------
# topic modeling for bios
#-----------------------------------------------------------------------------
# altmetric_follower_fh <- "/mnt/norbert/home/jedidiah/projects/hbd_meme_network/follower_bios2.rds"
altmetric_follower_fh <- paste0(datadir, "/article_data/follower_bios_", article_id, ".rds")
# set reset to FALSE to resume a partially completed run held in memory
reset <- TRUE
if(file.exists(altmetric_follower_fh)){
  out_df <- readRDS(altmetric_follower_fh)
} else {
  # scalar condition, so use || (fix: was the vectorized |)
  if(!exists("out_df") || reset){
    out_df <- data.frame(account=character(), bios=character())
  }
  # resume after accounts already processed; the explicit guard prevents the
  # descending sequence (n+1):n from running when every account is done
  start_row <- length(unique(out_df$account)) + 1
  if(start_row <= nrow(altmetric_data_full)){
    for(i in start_row:nrow(altmetric_data_full)){
      cat(paste0("Getting follower bios for: ", altmetric_data_full[i,]$account,
      " (", i, " of ", nrow(altmetric_data_full), ")...\n"))
      if(length(unlist(altmetric_data_full[i,]$followers)) > 0){
        # try() so one failed lookup does not abort the whole (slow) scrape
        bios <- try(get_follower_bios(followers=altmetric_data_full[i,]$followers))
        if(!inherits(bios, "try-error")){
          acct_follower_bios <- data.frame(account=altmetric_data_full[i,]$account, bios)
          out_df <- bind_rows(list(out_df, acct_follower_bios))
        } else {
          cat(paste0("Encountered error for user--results will not be included\n"))
        }
        # brief pause between users to be gentle on the rate limit
        Sys.sleep(10)
      }
    }
  }
  saveRDS(out_df, altmetric_follower_fh)
}
# convert emoji and hashtags to text, collapse follower bios
# One "document" per tweeting account: the concatenated names + bios of all of
# that account's followers, with emoji textualized (so they survive
# tokenization) and remaining non-ASCII characters dropped.
bios <- out_df %>%
dplyr::filter(account_lang=="en") %>%
dplyr::select(account, bio=description, name) %>%
# mutate(bio=ifelse(nchar(bio)!=0, bio, "missingbio")) %>%
# unite(description, c(name, bio), sep=" ") %>%
mutate(description= paste0(name, " ", bio)) %>%
mutate(description = emoji_to_text(description)) %>%
mutate(description = gsub("-", "", description)) %>%
mutate(description = gsub("#", "hashtag", description)) %>%
group_by(account) %>%
summarise(doc=paste(description, collapse = " ")) %>%
mutate(doc=iconv(doc, 'utf-8', 'ascii', sub=''))
# tokenize, drop spanish/french stop words, URLs, and custom stopword list
# The custom list also removes common first names and Twitter-bio boilerplate
# ("views", "rts", "opinions", ...) that is frequent but uninformative.
custom_stopwords <- c("https", "http", "t.co", "gmail.com", "views", "love", "lover", "tweets",
"rts", "follow", "twitter", "endorsement", "fan", "james", "michael",
"andrew", "ryan", "chris", "matt", "och", "rt", "opinions", "paul",
"endorsements", "account", "life", "john", "david", "social", "retweets",
stopwords(kind="en"), stopwords(kind="danish"), stopwords(kind="dutch"),
stopwords(kind="finnish"), stopwords(kind="french"), stopwords(kind="german"),
stopwords(kind="hungarian"), stopwords(kind="italian"), stopwords(kind="norwegian"),
stopwords(kind="portuguese"), stopwords(kind="russian"), stopwords(kind="spanish"),
stopwords(kind="swedish"))
bios_tokenized <- bios %>%
unnest_tokens(word, doc) %>%
dplyr::filter(!(word %in% custom_stopwords))
# apply default stopword list and count frequencies
# a term must occur >=10 times within an account's document to be kept
word_counts <- bios_tokenized %>%
anti_join(stop_words) %>%
count(account, word, sort = TRUE) %>%
dplyr::filter(n>=10)
#-----------------------------------------------------------------------------
# TESTING use 2-word tokens?
#-----------------------------------------------------------------------------
# bios_tokenized2 <- bios %>%
# unnest_tokens(ngram, doc, token = "ngrams", n = 2) %>%
# dplyr::filter(!(grepl(paste(custom_stopwords, collapse="|"), ngram)))
#
# ngram_counts <- bios_tokenized2 %>%
# count(account, ngram, sort = TRUE) %>%
# dplyr::filter(n>=5) %>%
# separate(ngram, c("word", "word2"), sep = " ") %>%
# anti_join(stop_words) %>%
# dplyr::rename(word1=word, word=word2) %>%
# anti_join(stop_words) %>%
# unite(word, c("word1", "word"), sep=" ")
#-----------------------------------------------------------------------------
# run over entire dataset
#-----------------------------------------------------------------------------
# Cast per-account term counts into a document-term matrix
# (rows = accounts, columns = terms), the input for the LDA model.
bios_dtm <- word_counts %>%
cast_dtm(account, word, n)
# bios_dtm <- bind_rows(word_counts, ngram_counts) %>%
# cast_dtm(account, word, n)
#
# bios_dtm <- ngram_counts %>%
# cast_dtm(account, word, n)
Using the ldatuning package, we calculate various goodness-of-fit metrics to help choose the number of topics for the LDA model.
# Evaluate candidate topic counts (k = 2..20) under four goodness-of-fit
# metrics, then plot the metrics against k to guide the choice of k.
result <- FindTopicsNumber(
bios_dtm,
topics = seq(from = 2, to = 20, by = 1),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
control = list(seed = 5678),
method = "VEM",
# method = "Gibbs",
mc.cores = 8L,
verbose = TRUE
)
# FindTopicsNumber_plot(result[result$Deveaud2014!=Inf,])
FindTopicsNumber_plot(result)
This code runs the LDA model to represent the corpus of documents as a mixture of K “topics.” Each topic is represented by a different set of words/terms that frequently co-occur. The gamma values are interpreted as the fraction of each document corresponding to each of the K topics.
# Fit the LDA model with k = 12 topics (the name bios_lda6 is a holdover from
# an earlier k = 6 run).
bios_lda6 <- LDA(bios_dtm, k = 12, control = list(alpha=0.1, seed = 5678), method="VEM")
# bios_lda6 <- LDA(bios_dtm, k = 6, control = list(seed = 5678), method="Gibbs")
# bios_lda6 <- LDA(bios_dtm, k = 6, control = list(seed = 1234))
# bios_lda
# per-topic-per-term probabilities (beta)
bios_lda_td <- tidy(bios_lda6)
# bios_lda_td
# top 30 terms per topic, with emoji/hashtag tokens converted back for display
top_terms <- bios_lda_td %>%
group_by(topic) %>%
top_n(30, beta) %>%
ungroup() %>%
arrange(topic, -beta) %>%
mutate(term = text_to_emoji(term)) %>%
mutate(term = gsub("hashtag", "#", term))
# one comma-separated label string per topic (the column is named top_10 but
# holds the top 30 terms selected above)
topics_terms <- top_terms %>%
dplyr::select(-beta) %>%
# mutate(topic=paste0("t", topic)) %>%
group_by(topic) %>%
summarise(top_10=paste(term, collapse=", ")) %>%
ungroup()
# per-document topic proportions (gamma); a tiny uniform jitter breaks ties so
# the top_n(1) calls below select a single dominant topic per document
bios_lda_gamma <- tidy(bios_lda6, matrix = "gamma") %>%
rowwise() %>%
mutate(gamma=gamma+runif(1,0,0.0001))
# order documents by their dominant topic for plotting
docs_order <- bios_lda_gamma %>%
group_by(document) %>%
arrange(topic, -gamma) %>%
top_n(1, gamma) %>%
rename(topic_group = topic) %>%
dplyr::select(-gamma)
# docs_order <- bios_lda_gamma %>%
# spread(topic, gamma) %>%
# # dplyr::arrange(-`1`, -`2`, -`3`, -`4`, -`5`, -`6`) %>%
# dplyr::arrange(-`1`, -`2`, -`3`, -`4`, -`5`) %>%
# # gather(topic, gamma, `1`:`6`) %>% #head
# gather(topic, gamma, `1`:`5`) %>% #head
# group_by(document) %>%
# # arrange(topic, -gamma) %>%
# top_n(1, gamma) %>%
# rename(topic_group = topic) %>%
# dplyr::select(-gamma)
# devtools::install_github("GuangchuangYu/emojifont")
# library(emojifont)
# load.emojifont('OpenSansEmoji.ttf')
This code generates a plot showing the topic breakdowns of each account’s follower base.
# 18 colors (9 + 9 from two brewer palettes): one per topic, with headroom
cols <- c(brewer.pal(9, "Set1"), brewer.pal(9, "Set3"))
# Stacked barplot: one bar per account, filled by topic proportion (gamma);
# topic legend labels carry each topic's top terms.
p <- bios_lda_gamma %>%
# left_join(altmetric_data_full %>%
# rowwise() %>%
# mutate(nfol=length(followers)) %>%
# ungroup() %>%
# mutate(pct_fol=nfol/sum(nfol)) %>%
# dplyr::select(document=account, pct_fol), by="document") %>%
mutate(document=factor(document, levels=docs_order$document)) %>%
left_join(topics_terms, by="topic") %>%
mutate(topic=paste0(topic, ": ", top_10)) %>%
mutate(topic=factor(topic, levels=unique(topic))) %>%
ggplot(aes(x=document, y=gamma, fill=topic))+
# geom_bar(aes(width=pct_fol), stat="identity", position="stack")+
geom_bar(stat="identity", position="stack")+
scale_fill_manual(values=cols)+
scale_y_continuous(expand = c(0,0))+
scale_x_discrete(position = "top")+
xlab("Account")+
theme(legend.position="bottom",
axis.title.y=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())+
guides(fill=guide_legend(ncol=1))
# interactive version with the legend anchored below the plot
p2 <- ggplotly(p) %>%
layout(legend = list(orientation = "v",
xanchor = "center", # use center of legend as anchor
yanchor = "bottom",
x = 0, y=-1))
p2
htmlwidgets::saveWidget(p2,
file=paste0(datadir, "/figs/topic_breakdown_by_user_", article_id, ".html"),
title=paste0("topic_breakdown_by_user_", article_id))
# ggsave(paste0(datadir, "/topic_breakdown_by_user_", article_id, ".png"), plot=p, width = 18, height=9)
# Variant of the topic barplot where each account's rectangle spans its
# cumulative follower-count interval (left/right), faceted by topic.
# NOTE(review): under rowwise(), length(followers) may count list elements
# rather than individual followers (length(unlist(followers)) is used
# elsewhere for that) -- verify the intended weighting.
p_weight <- bios_lda_gamma %>%
left_join(altmetric_data_full %>%
rowwise() %>%
mutate(nfol=length(followers)) %>%
ungroup() %>%
mutate(right=cumsum(nfol), left=right-nfol) %>%
dplyr::select(document=account, left, right), by="document") %>%
mutate(document=factor(document, levels=docs_order$document)) %>%
left_join(topics_terms, by="topic") %>%
mutate(topic=paste0(topic, ": ", top_10)) %>%
mutate(topic=factor(topic, levels=unique(topic))) %>%
ggplot(aes(x=document, y=gamma, fill=topic, group=document))+
# geom_bar(aes(width=pct_fol), stat="identity", position="stack")+
geom_bar(stat="identity", position="stack")+
geom_rect(aes(xmin = left, xmax = right, ymin=0, ymax=gamma)) +
scale_fill_manual(values=cols)+
scale_y_continuous(expand = c(0,0))+
facet_wrap(~topic, scales="free", ncol=1)+
xlab("Account")+
# theme_classic()+
theme(legend.position="none",
axis.title.y=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())+
guides(fill=guide_legend(ncol=1))
p_weight
ggsave(paste0(datadir, "/figs/topic_breakdown_by_user_weight_", article_id, ".png"),
plot=p_weight, width = 12, height=12)
Apply PCA followed by UMAP to get 2D embedding of accounts according to homophily with reference data
#-----------------------------------------------------------------------------
# Generate follower overlap matrix:
# 1 row per test account, i, 1 column per reference account, j
# each cell contains fraction of i's followers that also follow j
#-----------------------------------------------------------------------------
sim_matrix <- altmetric_data_full %>%
  # Keep accounts with at least 10 scraped followers.
  # (Fix: `length(followers)` evaluated to a single scalar -- the length of
  # the whole list-column -- so the filter was all-or-nothing rather than
  # per account; count each account's followers individually instead.)
  dplyr::filter(vapply(followers, function(f) length(unlist(f)), integer(1)) >= 10) %>%
  group_by(account) %>%
  nest() %>%
  # one row of reference-account overlap fractions per test account
  mutate(test=map(data, ~match_followers(.$followers))) %>% unnest(test)
saveRDS(sim_matrix, paste0(datadir, "/article_data/sim_matrix_", article_id, ".rds"))
# apply PCA to count matrix
# NOTE(review): columns 2:41 and .fittedPC40 below assume exactly 40
# reference accounts -- update if the training set changes.
sim_matrix_pca <- sim_matrix %>%
dplyr::select(-data) %>%
mutate(sum = rowSums(.[2:41])) %>%
dplyr::filter(sum!=0) %>%
dplyr::select(-sum) %>%
nest() %>%
mutate(pca = purrr::map(data, ~prcomp(.x %>% dplyr::select(-account), center = TRUE, scale = TRUE)),
pca_tidy = purrr::map2(pca, data, ~broom::augment(.x, data = .y))) #%>% unnest() %>% head
# apply UMAP to PCA data
# sim_matrix_pca[[3]][[1]] is the augmented (pca_tidy) data frame
sim_matrix_umap <- sim_matrix_pca[[3]][[1]] %>%
dplyr::select(.fittedPC1:.fittedPC40) %>%
data.frame() %>%
umap(n_neighbors=30, random_state=36262643)
# join UMAP coordinates (X1, X2) back to user metadata and compute each
# account's white-nationalist vs scientist homophily ratio
umap_plotdat <- bind_cols(sim_matrix_pca[[3]][[1]], data.frame(sim_matrix_umap$layout)) %>%
left_join(user_data %>% dplyr::rename(account=screen_name),
by="account") %>%
mutate(wn_mean=rowMeans(dplyr::select(.,vdare:NewRightAmerica), na.rm = TRUE),
sc_mean=rowMeans(dplyr::select(.,pastramimachine:girlscientist), na.rm = TRUE)) %>%
# log10 ratio; +0.001 guards against division by zero
mutate(affiliation=log10(wn_mean/(sc_mean+0.001))) %>%
dplyr::filter(sc_mean != 0 & wn_mean != 0) %>%
mutate(urls=paste0("https://twitter.com/", account))
# plotdat2 <- plotdat
# density-based clustering on the 2D embedding coordinates
hdb_clust <- umap_plotdat %>%
dplyr::select(X1:X2) %>%
as.matrix() %>%
hdbscan(x=., minPts=10)
umap_plotdat$cluster <- as.character(hdb_clust$cluster)
# Interactive scatterplot of the UMAP embedding: one point per account, sized
# by log follower count, colored by the WN:scientist homophily ratio.
# Clicking a point opens that account's Twitter profile (via the onRender JS
# handler reading customdata).
plot_embedding <- function(plotdat){
p <- plotdat %>% # merge with user_data to get followers_count + other info
# replace_na(list(hbd=FALSE)) %>%
ggplot(aes(x=X1, y=X2, label=account, colour=affiliation))+
geom_point(aes(size=log(followers_count)), alpha=0.8)+
scale_colour_gradientn("WN:Scientist Follower Ratio",
colors=rev(brewer.pal(9, "RdBu")),
breaks=seq(-3,3),
labels=c("1:1000", "1:100", "1:10","1:1","10:1","100:1","1000:1"),
limits=c(-3,3))+
theme_dark()+
theme(legend.position="bottom")
ply <- ggplotly(p)
# Clickable points link to profile URL using onRender: https://stackoverflow.com/questions/51681079
# NOTE(review): assumes the point order in ply$x$data[[1]] matches plotdat
# row order -- fragile if ggplotly ever splits the points across traces.
ply$x$data[[1]]$customdata <- plotdat$urls
#pp <- add_markers(pp, customdata = ~url)
plyout <- onRender(ply, "
function(el, x) {
el.on('plotly_click', function(d) {
var url = d.points[0].customdata;
//url
window.open(url);
});
}
")
plyout
}
# Render the interactive embedding and save it to disk. Build the widget
# once (fix: plot_embedding() was previously called twice, re-rendering the
# whole plotly widget for the save step).
embedding_widget <- plot_embedding(umap_plotdat)
embedding_widget
htmlwidgets::saveWidget(embedding_widget,
file=paste0(datadir, "/figs/homophily_ratio_", article_id, ".html"),
title=paste0("homophily_ratio_", article_id))
# Dominant topic (with its term-label string) per account, used to color the
# embedding plots below.
lda_gammas <- bios_lda_gamma %>%
mutate(document=factor(document, levels=docs_order$document)) %>%
left_join(topics_terms, by="topic") %>%
mutate(topic=paste0(topic, ": ", top_10)) %>%
mutate(topic=factor(topic, levels=unique(topic))) %>%
group_by(document) %>%
arrange(topic, -gamma) %>%
top_n(1, gamma) %>%
rename(account=document)
# UMAP embedding recolored by each account's dominant LDA topic
# (note: reuses the name p2 from the earlier stacked-bar widget)
p2 <- umap_plotdat %>%
left_join(lda_gammas, by="account") %>%
ggplot(aes(x=X1, y=X2, label=account, colour=topic))+
geom_point(size=2)+
scale_colour_manual(values=cols)+
theme_dark()+
theme(legend.position="none")#+
# guides(colour=guide_legend(ncol=1))
ggplotly(p2) #%>%
# layout(legend = list(
# orientation = "h"
# )
# )
htmlwidgets::saveWidget(ggplotly(p2),
file=paste0(datadir, "/figs/homophily_lda_", article_id, ".html"),
title=paste0("homophily_lda_", article_id))
# PCA on the document-by-topic gamma matrix
bios_lda_pca_in <- bios_lda_gamma %>%
spread(topic, gamma) %>%
dplyr::select(-document) %>% as.matrix
# NOTE(review): bios_lda_pca1 is computed but never used below (the nested
# bios_lda_pca version is used instead) -- candidate for removal.
bios_lda_pca1 <- prcomp(bios_lda_pca_in, center=T, scale=T)
bios_lda_pca <- bios_lda_gamma %>%
spread(topic, gamma) %>%
nest() %>%
mutate(pca = purrr::map(data, ~prcomp(.x %>% dplyr::select(-document), center = TRUE, scale = TRUE)),
pca_tidy = purrr::map2(pca, data, ~broom::augment(.x, data = .y)))
# first two PCs of the topic mixture, colored by dominant topic
p3 <- bios_lda_pca[[3]][[1]] %>%
rename(account=document) %>%
left_join(lda_gammas, by="account") %>% #head
# inner_join(sim_matrix, by="account") %>%
left_join(user_data %>% dplyr::rename(account=screen_name),
by="account") %>%
# mutate(wn_mean=rowMeans(dplyr::select(.,vdare:NewRightAmerica), na.rm = TRUE),
# sc_mean=rowMeans(dplyr::select(.,pastramimachine:girlscientist), na.rm = TRUE)) %>% #head
# mutate(affiliation=log10(wn_mean/(sc_mean+0.001))) %>%
# mutate(affiliation_binary=ifelse(affiliation<log10(0.5),
# "Science",
# ifelse(affiliation>log10(2),
# "White Nationalist",
# "Neutral"))) %>%
ggplot(aes(x=.fittedPC1, y=.fittedPC2, colour=topic, label=account))+
geom_point(size=2, alpha=0.7)+
scale_colour_manual(values=cols)+
# scale_shape_manual(values=c(3, 1, 7))+
# theme_dark()+
theme(legend.position="bottom")+
guides(colour=guide_legend(ncol=1))
p3_ply <- ggplotly(p3) %>%
layout(legend = list(orientation = "v",
xanchor = "center", # use center of legend as anchor
yanchor = "bottom",
x = 0, y=-1))
p3_ply
htmlwidgets::saveWidget(p3_ply,
file=paste0(datadir, "/figs/lda_pca_", article_id, ".html"),
title=paste0("lda_pca_", article_id))
# Fraction of each account's followers that have an empty bio.
# `bio` is recoded to a missing-indicator: 0 = bio present, 1 = bio missing,
# so sum(bio) counts missing bios per account.
bios_m <- out_df %>%
# dplyr::filter(account_lang=="en") %>%
dplyr::select(account, bio=description, name) %>%
mutate(bio=ifelse(nchar(bio)!=0, 0, 1)) %>%
# unite(description, c(name, bio), sep=" ") %>%
group_by(account) %>%
summarise(tot=n(), nmissing=sum(bio), pct=nmissing/tot) %>% #dim
# dplyr::rename("document"="account") %>%
inner_join(lda_gammas, by="account") %>%
inner_join(umap_plotdat, by="account")
# boxplot + jitter of missing-bio fraction, grouped by dominant topic
p4 <- bios_m %>%
mutate(topic_num=gsub(":.*", "", topic)) %>%
ggplot(aes(x=topic_num, y=pct, colour=topic, label=account))+
geom_jitter(size=3, alpha=0.6)+
geom_boxplot(outlier.shape=NA)+
scale_colour_manual(values=cols)+
theme(legend.position="bottom",
axis.title.y=element_blank(),
axis.text.x=element_blank())+
guides(colour=guide_legend(ncol=1))
p4_ply <- ggplotly(p4) %>%
layout(legend = list(orientation = "v",
xanchor = "center", # use center of legend as anchor
yanchor = "bottom",
x = 0, y=-1))
htmlwidgets::saveWidget(p4_ply,
file=paste0(datadir, "/figs/missing_dist_", article_id, ".html"),
title=paste0("missing_dist_", article_id))
p4_ply
# Relationship between missing-bio fraction and WN homophily, with one linear
# fit per dominant topic; accounts with >=50% missing bios are dropped.
p4a <- bios_m %>%
mutate(topic_num=gsub(":.*", "", topic)) %>%
dplyr::filter(pct<0.5) %>%
ggplot(aes(x=pct, y=wn_mean, group=topic_num, colour=topic, label=account))+
geom_point()+
geom_smooth(method="lm", se=F)+
scale_colour_manual(values=cols)+
# facet_wrap(~topic_num, scales="free")+
xlab("Fraction of followers with missing bios")+
ylab("WN Homophily")+
theme(legend.position="bottom")+
guides(colour=guide_legend(ncol=1))
p4a_ply <- ggplotly(p4a) %>%
layout(legend = list(orientation = "v",
xanchor = "center", # use center of legend as anchor
yanchor = "bottom",
x = 0, y=-1))
htmlwidgets::saveWidget(p4a_ply,
file=paste0(datadir, "/figs/missing_homophily_cor_", article_id, ".html"),
title=paste0("missing_homophily_cor_", article_id))
p4a_ply
# Hierarchical clustering of accounts by cosine distance between their
# document-term vectors.
m <- as.matrix(bios_dtm)
# distMatrix <- dist(m, method="euclidean")
# cosine similarity: normalize rows to unit length, then cross-product
sim <- m / sqrt(rowSums(m * m))
sim <- sim %*% t(sim)
distMatrix <- as.dist(1 - sim)
groups <- hclust(distMatrix,method="ward.D")
dend <- as.dendrogram(groups)
dend_data <- dendro_data(dend, type = "rectangle")
# leaf order of the dendrogram, keyed by account
label_order <- dend_data$labels %>%
dplyr::rename("account"="label")
# coldf <- data.frame(topic=c(1,10:12,2:9), col=cols[c(1,10:12,2:9)])
# map topic numbers (in the 1,10,11,12,2..9 string-sort order) to colors
coldf <- data.frame(topic=c(1,10:12,2:9), col=cols[1:12])
# per-leaf colors ordered by dendrogram leaf position (x)
lgo <- lda_gammas %>%
mutate(topic=as.numeric(gsub(":.*", "", topic))) %>%
left_join(coldf, by="topic") %>%
ungroup() %>%
left_join(label_order, by="account") %>%
arrange(x)
# ggdendrogram(groups)
#
# set("labels_col", "blue")
# Style the dendrogram: blank labels and one colored point per leaf, keyed to
# the account's dominant LDA topic (colors from lgo, in leaf order).
styled_dend <- dend %>%
  set("labels", "") %>% # change labels
  # set("labels_col", lgo$col) %>% set("labels_cex", 1) %>%
  set("leaves_pch", 19) %>% set("leaves_cex", 1) %>% set("leaves_col", lgo$col)
p_dend <- plot(styled_dend)
# Save the dendrogram itself. (Fix: this previously called
# ggsave(plot=p_weight), which re-saved the earlier weighted-topic barplot
# under the cosine_similarity filename; the dendrogram is base graphics, so
# capture it with a png device instead.)
png(paste0(datadir, "/figs/cosine_similarity_", article_id, ".png"),
    width = 12, height = 12, units = "in", res = 150)
plot(styled_dend)
dev.off()
# PCA on the cosine distance matrix itself
dmp <- prcomp(as.matrix(distMatrix), center=TRUE, scale=TRUE)
dmp_df <- dmp$x %>%
as_tibble(rownames="account") %>%
inner_join(lda_gammas, by="account")
# first two PCs, colored by dominant topic
p5 <- dmp_df %>%
# mutate(topic_num=gsub(":.*", "", topic)) %>%
ggplot(aes(x=PC1, y=PC2, colour=topic, label=account))+
geom_point()+
scale_colour_manual(values=cols)+
theme(legend.position="none")+
guides(colour=guide_legend(ncol=1))
p5_ply <- ggplotly(p5) %>%
layout(legend = list(orientation = "v",
xanchor = "center", # use center of legend as anchor
yanchor = "bottom",
x = 0, y=-1))
htmlwidgets::saveWidget(p5_ply,
file=paste0(datadir, "/figs/cosine_pca_", article_id, ".html"),
title=paste0("cosine_pca_", article_id))
p5_ply
# UMAP on the principal components of the cosine distance matrix
dmp_umap <- dmp$x %>% as.data.frame() %>%
umap(n_neighbors=20, random_state=36262643)
dmp_df2 <- dmp_umap$layout %>%
as_tibble(rownames="account") %>%
inner_join(lda_gammas, by="account") %>%
left_join(umap_plotdat, by="account")
# embedding colored by dominant topic, point size = WN homophily
# (note: reuses the name p2 yet again)
p2 <- dmp_df2 %>%
ggplot(aes(x=V1, y=V2, label=account, colour=topic))+
geom_point(aes(size=wn_mean), alpha=0.7)+
scale_colour_manual(values=cols)+
theme_dark()+
theme(legend.position="none")#+
# guides(colour=guide_legend(ncol=1))
htmlwidgets::saveWidget(ggplotly(p2),
file=paste0(datadir, "/figs/cosine_umap_", article_id, ".html"),
title=paste0("cosine_umap_", article_id))
ggplotly(p2)
Plot the UMAP embedding with points represented as piecharts showing topic breakdown of each account, not just the top topic
# Pie-chart glyphs at each account's embedding position showing the full
# topic mixture (gamma across all topics), not just the dominant topic.
sp_dat <- dmp_df2 %>%
dplyr::select(document=account, top_topic=topic, V1, V2) %>%
inner_join(bios_lda_gamma, by="document") %>%
left_join(topics_terms, by="topic") %>%
mutate(topic=paste0(topic, ": ", top_10)) %>%
dplyr::select(-top_10) %>%
spread(topic, gamma)
# NOTE(review): columns 5:16 assume exactly 12 topic columns after the 4
# id/coordinate columns -- update if k changes.
ggplot()+
geom_scatterpie(aes(x=V1, y=V2, group=document),
data=sp_dat,
cols=names(sp_dat)[5:16],
color=NA,
alpha=0.7)+
scale_fill_manual(values=cols)+
theme_dark()+
theme(legend.position="none")
# Retweet timelines: for each tweet text seen more than 3 times, plot its
# cumulative retweet order over time, colored by the retweeter's dominant
# topic and sized by WN homophily.
p_times <- ts_df %>%
rename(account=names) %>%
left_join(dmp_df2, by="account") %>%
group_by(tweets) %>%
arrange(timestamps) %>%
mutate(order=row_number(), n=n()) %>%
dplyr::filter(n>3) %>%
ungroup() %>%
ggplot(aes(x=timestamps, y=order, group=tweets, label=account))+
geom_line(colour="grey80")+
geom_point(aes(colour=topic, size=wn_mean), alpha=0.5)+
scale_colour_manual(values=cols)+
scale_y_log10()+
# show every 10th timestamp on the x axis to keep labels readable
scale_x_discrete(breaks=ts_df$timestamps[seq(1, nrow(ts_df), 10)])+
ylab("Retweet Number")+
theme_classic()+
theme(axis.title.x=element_blank(),
axis.text.x=element_text(size=6, angle=45, hjust=1),
legend.position="none")
htmlwidgets::saveWidget(ggplotly(p_times),
file=paste0(datadir, "/figs/timeline_", article_id, ".html"),
title=paste0("timeline_", article_id))
ggplotly(p_times)
# tweets_cleaned <- ts_df %>%
# rename("account"="names") %>%
# dplyr::filter(!grepl("RT @", tweets)) %>%
# mutate(tweets=gsub("http.*", "", tweets)) %>%
# mutate(tweets=gsub("\"", "", tweets)) %>%
# mutate(description = emoji_to_text(tweets)) %>%
# mutate(description = gsub("-", "", description)) %>%
# mutate(description = gsub("#", "hashtag", description)) %>%
# left_join(lda_gammas, by="account") %>%
# group_by(topic) %>%
# summarise(doc=paste(description, collapse = " ")) %>%
# mutate(doc=iconv(doc, 'utf-8', 'ascii', sub='')) #%>% #head
# mutate(tweets=substr(tweets, 1, 100)) %>%
# dplyr::select(tweets) %>%
# dplyr::group_by(tweets) %>%
# count() %>%
# tidyr::separate(tweets, into=c("user", "tweet"), sep=": ", remove=FALSE) %>% #head
# String similarity between each tweet and the article's title/abstract,
# reshaped to long form (one row per tweet x comparison) and joined with the
# tweeter's dominant topic.
title_sim <- ts_df %>%
# dplyr::filter(!grepl("RT @", tweets)) %>%
mutate(tweets=gsub("http.*", "", tweets)) %>%
mutate(tweets=gsub("\"", "", tweets)) %>%
# NOTE(review): `description` is computed here but the similarity scores
# below use the raw `tweets` column -- confirm which was intended.
mutate(description = emoji_to_text(tweets)) %>%
rename("account"="names") %>%
mutate(title_match_score=sim.strings(tweets, article_df$title)) %>%
mutate(abstract_match_score=sim.strings(tweets, abstract)) %>%
gather(group, value=sim_score, title_match_score:abstract_match_score) %>% #head
left_join(lda_gammas, by="account") #%>%
# group_by(topic) %>%
# summarise(tot=mean(title_match))
# Ridgeline densities of similarity scores by topic, faceted by title vs
# abstract comparison.
p_title <- title_sim %>%
# ggplot(aes(x=topic, y=sim_score, colour=topic, label=tweets))+
ggplot(aes(x=sim_score, y=topic, fill=topic, label=tweets))+
# geom_line(colour="grey80")+
geom_density_ridges(aes(point_color = topic), scale = 4, alpha = .6, jittered_points = TRUE)+
#theme_ridges() +
# geom_boxplot()+
# geom_jitter()+
facet_wrap(~group)+
# geom_point(aes(colour=topic, size=wn_mean), alpha=0.5)+
scale_fill_manual(values=cols)+
scale_colour_manual(values=cols, aesthetics = "point_color")+
theme_classic()+
theme(axis.text.y=element_blank(),
# axis.text.x=element_text(size=6, angle=45, hjust=1),
legend.position="none")
p_title
## Picking joint bandwidth of 0.0515
## Picking joint bandwidth of 0.108
ggsave(paste0(datadir, "/figs/title_sim_ridges_", article_id, ".png"),
plot=p_title, width = 12, height=12)
## Picking joint bandwidth of 0.0515
## Picking joint bandwidth of 0.108
# Boxplot variant of the similarity-by-topic comparison, saved as an
# interactive widget (reassigns p_title).
p_title <- title_sim %>%
ggplot(aes(x=topic, y=sim_score, colour=topic, label=tweets))+
# ggplot(aes(x=sim_score, y=topic, fill=topic, label=tweets))+
# geom_line(colour="grey80")+
# geom_density_ridges(aes(point_color = topic), scale = 4, alpha = .2, jittered_points = TRUE)+
#theme_ridges() +
geom_boxplot()+
geom_jitter()+
facet_wrap(~group)+
# geom_point(aes(colour=topic, size=wn_mean), alpha=0.5)+
scale_colour_manual(values=cols)+
theme_classic()+
theme(axis.text.x=element_blank(),
# axis.text.x=element_text(size=6, angle=45, hjust=1),
legend.position="none")
htmlwidgets::saveWidget(ggplotly(p_title),
file=paste0(datadir, "/figs/title_sim_", article_id, ".html"),
title=paste0("title_sim_", article_id))
ggplotly(p_title)
# word_counts_tweets <- tweets_cleaned %>%
# unnest_tokens(word, doc) %>%
# dplyr::filter(!(word %in% custom_stopwords)) %>%
# dplyr::filter(!(word %in% unlist(strsplit(article_df$title, " "))) %>%
# anti_join(stop_words) %>%
# count(topic, word, sort = TRUE) %>%
# dplyr::filter(n>=2)
#
# #-----------------------------------------------------------------------------
# # run over entire dataset
# #-----------------------------------------------------------------------------
# tweets_dtm <- word_counts_tweets %>%
# cast_dtm(topic, word, n)
#
# tweets_lda6 <- LDA(tweets_dtm, k = 5, control = list(alpha=0.1, seed = 5678), method="VEM")
#
# tweets_lda_td <- tidy(tweets_lda6)
#
# top_terms_tweets <- tweets_lda_td %>%
# group_by(topic) %>%
# top_n(10, beta) %>%
# ungroup() %>%
# arrange(topic, -beta) %>%
# mutate(term = text_to_emoji(term)) %>%
# mutate(term = gsub("hashtag", "#", term))
#
#
# top_terms_tweets %>% data.frame
# inner_join(get_sentiments("nrc")) %>% #head(20)
# count(user, sentiment) %>%
# spread(sentiment, nn, fill = 0) %>%
# mutate(sentiment = positive - negative) #%>%
#
# inner_join(get_sentiments("bing")) %>%
# count(book, index = linenumber %/% 80, sentiment) %>%
# spread(sentiment, n, fill = 0) %>%
# mutate(sentiment = positive - negative)